Table of contents :

In [2]:
# Import custom packages & modules
import sys
sys.path.append("..")
# Univariate analysis module
from src.analyzer.univariate import *
# Multivariate analysis module
from src.analyzer.multivariate import *
# Data cleaning module
from src.datacleaner import *
# Data preprocessing module
from src.preprocessor import *
# Model evaluation module
from src.evaluator import *
# Ensemble-based methods visualization module
from src.modelizer.ensemble.tree_interpreter import *
# Model selection
from sklearn.model_selection import train_test_split
# Linear models
from sklearn.linear_model import ElasticNet, ElasticNetCV
# Non-linear models
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
# Ensemble-based methods
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
# Compute training time
import datetime
In [2]:
# Model evaluation wrappers

# Define project scorers 
third_project_scorers = ['neg_root_mean_squared_error', 'r2']


def train_gridsearch(data, model, param_grid, metric=third_project_scorers, k=10, p=3, v=True):
    # Model name
    model_label = model_name(model)
    # Get training & testing data
    x_train, y_train = data['train']
    x_test, y_test = data['test']
    # Define refit condition (first metric if evaluationg multiple metrics else False)
    refit_cond = metric[0] if type(metric) is list else False
    # Build grid search
    gridsearch = GridSearchCV(model, param_grid, cv=k, scoring=metric, refit=refit_cond)
    # Time the model training
    start_training = datetime.datetime.now()
    # Train model with grid search
    gridsearch.fit(x_train, y_train)
    end_training = datetime.datetime.now()
    # Compute training time
    training_time = end_training - start_training
    # Format training time
    training_time_str = format_run_time(training_time)
    # Trained_model
    trained_model = gridsearch.best_estimator_
    # Get scores from cross validation
    cv_scores = {}
    for scorer_label in (metric if type(metric) is list else [metric]):
        if scorer_label.startswith('neg'):
            formatted_label = "".join([w[0] for w in scorer_label.replace('neg_', '').split('_')])
            formatted_score = round(np.abs(gridsearch.cv_results_[f'mean_test_{scorer_label}'])[0], p)
            cv_scores[formatted_label] = formatted_score
        else:
            cv_scores[scorer_label] = round(gridsearch.cv_results_[f'mean_test_{scorer_label}'][0], p)
    # Get scores from testing set
    testing_set_scores = get_model_scores(trained_model, x_test, y_test, list(cv_scores.keys()), p, v)
    # Display cross validation mean scores
    if v:
        print_score_results(cv_scores, set_type='train')
    # Build model dictionary which contains GridSearchCV & model instances (with model name)
    model_data = {'gs': gridsearch,                                 # GridSearchCV trained instance
                  'model': trained_model,                           # Model trained instance
                  'model_name': model_label}                        # Model name
    # Build additional evaluation data dictionary
    additional_evaluation_data = {'time': training_time_str,        # Training time
                                  'n_features': x_train.shape[1],   # Selected features
                                  'learning_potential': None}       # Learning potential
    # Build results dictionary (merge dictionnaries)
    results = dict(**model_data, **testing_set_scores, **additional_evaluation_data)
    return results


# To do : refacto (cumsum is a "threshold")
def select_most_important_features(features_coefs_df,
                                   n=None,
                                   method='cumsum',
                                   model=None,
                                   thr='mean',
                                   q_value=0.5,
                                   thr_value=None,
                                   v=False):
    """
    Wrapper which select most important features from machine learning model, based on specified method
    (cumulative threshold or)
    """
    # Feature importance selection method
    if method is 'cumsum':                                # if thr is 'cumsum'
        n_mif_data = features_coefs_df.iloc[:n]
        n_mif_labels = n_mif_data['feature'].tolist()
    elif method is 'threshold':                           # else
        # Threshold method
        if thr is 'q': # quantile
            mif_thr = features_coefs_df['coefficient'].quantile(q=q_value)
        elif thr is 'mean': # mean
            mif_thr = features_coefs_df['coefficient'].mean()
        else: # arbitrary number
            mif_thr = thr_value
        # N.B : function from mlearn preprocessor module
        n_mif_data = filter_features_by_threshold(X,
                                                  X_train_std,
                                                  X_test_std,
                                                  model,
                                                  mif_thr,
                                                  verbose=v)
        n_mif_labels = n_mif_data['labels']
    # Return n selected features dataframe & labels
    return n_mif_data, n_mif_labels


def run_training_cycle(features_reduced,
                       model_param_grid,
                       model=None,
                       target='energy',
                       training_wrapper='gs',
                       k=5,
                       s=third_project_scorers,
                       p=3,
                       v=True):
    """
    Wrapper which run a training cycle based on reduced features
    """
    # Filter training and testing features data (reduce the number of features)
    X_train_std_reduced = X_train_std_df.loc[:, features_reduced]
    X_test_std_reduced = X_test_std_df.loc[:, features_reduced]
    # Define target type
    if target is 'energy':
        y_train_target, y_test_target = y_train_energy, y_test_energy
    elif target is 'emissions':
        y_train_target, y_test_target = y_train_emissions, y_test_emissions
    # Build training & testing data dictionary for each target   
    training_and_testing_data_reduced = {"train": [X_train_std_reduced, y_train_target],
                                         "test": [X_test_std_reduced, y_test_target]}
    # Define training wrapper type (elastic net cross val or gridsearchCV)
    if training_wrapper == 'en':
        results = train_elastic_net(training_and_testing_data_reduced,
                                    model_param_grid)
    elif training_wrapper == 'gs':
        results = train_gridsearch(training_and_testing_data_reduced,
                                   model,
                                   model_param_grid,
                                   metric=s,
                                   k=k,       
                                   p=p,
                                   v=v)
    return results, training_and_testing_data_reduced


# Elastic Net Wrapper

def train_elastic_net(data, params_grid, k=10, v=True, s=['rmse', 'r2']):
    # Get training and testing sets from data dictionary
    X_train_std, y_train = data["train"]
    X_test_std, y_test = data["test"]
    # Get (hyper)parameters from parameters grid
    alpha_range, l1_ratio_range = params_grid.values()
    # Build model using cross validation 
    elastic_net = ElasticNetCV(alphas=alpha_range,
                               cv=k,
                               l1_ratio=l1_ratio_range)
    start_training = datetime.datetime.now()
    # Train model with cross validation
    elastic_net.fit(X_train_std, y_train)
    end_training = datetime.datetime.now()
    # Compute training time
    training_time = end_training - start_training
    # Format training time
    training_time_str = format_run_time(training_time)
    # Optimal parameters
    optimal_alpha = elastic_net.alpha_
    optimal_l1_ratio = elastic_net.l1_ratio_
    if v:
        print("Alpha : {} | l1_ratio : {}\n".format(optimal_alpha,
                                                    optimal_l1_ratio))
    # Regularization path
    alphas, coefs, _ = elastic_net.path(X_train_std,
                                        y_train,
                                        alphas=alpha_range,
                                        l1_ratio=optimal_l1_ratio)
    # Format scorer label (example : neg_root_mean_squared_error --> rmse)
    formatted_s = []
    for scorer_label in s:
        if scorer_label.startswith('neg'):
            scorer_label = "".join([w[0] for w in scorer_label.replace('neg_', '').split('_')])
        formatted_s.append(scorer_label)
    # Model evaluation
    testing_set_scores = get_model_scores(elastic_net,
                                          X_test_std,
                                          y_test,
                                          scorer=formatted_s,
                                          verbose=v)
    # Build model dictionary which contains model data (instance & name)
    model_data = {'model': elastic_net,
                  'model_name': model_name(elastic_net)}
    # Build additional evaluation data dictionary
    additional_evaluation_data = {'time': training_time_str,            # Training time
                                  'n_features': X_train_std.shape[1],   # Selected features
                                  'learning_potential': None,           # Learning potential
                                  'best_params': [optimal_alpha, optimal_l1_ratio],
                                  'reg_path_data': [alphas, coefs]}            
    # Build results dictionary (merge dictionnaries) 
    results = dict(**model_data, **testing_set_scores, **additional_evaluation_data)
    return results
In [3]:
# Import du dataset
df_raw = pd.read_csv('../data/csv/seattle_model_data_no_ENERGYSTARScore.csv')
df = df_raw.copy()
print(df.shape)
df.head(2)
(6129, 213)
Out[3]:
PropertyGFATotal LargestPropertyUseTypeGFA SecondLargestPropertyUseTypeGFA ThirdLargestPropertyUseTypeGFA NumberofFloors SiteEnergyUse(kBtu) TotalGHGEmissions Main_energy_electricity Main_energy_gaz Main_energy_steam ... Neighborhood_Downtown Neighborhood_East Neighborhood_Greater duwamish Neighborhood_Lake union Neighborhood_Magnolia / queen anne Neighborhood_North Neighborhood_Northeast Neighborhood_Northwest Neighborhood_Southeast Neighborhood_Southwest
0 10.030957 10.030957 0.000000 0.0 3 14.121906 2.248129 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 12.685002 12.114253 11.780996 0.0 5 16.653732 6.024295 1 0 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 213 columns

In [4]:
# Targets
targets_cols = ['SiteEnergyUse(kBtu)', 'TotalGHGEmissions']

# Features
features_cols = [col for col in df.columns if col not in targets_cols]

# Features data
df_without_targets = df[features_cols]

I - Preprocessing

1 - Feature Selection

1.1 - Filter features with identical or null variance

In [5]:
# N.B : functions from mlearn prepocessor module 

# Method :
# null_variance_cols = features_with_null_variances(df, verbose=True)
# identical_variance_cols = features_with_identical_variances(df, col_kept='last', verbose=True)
# invalid_variances_cols = null_variance_cols + identical_variance_cols
# df = df[[col for col in df.columns if col not in invalid_variances_cols]]

# Remove invalid features from dataframe
df = filter_invalid_variances(df, feature_kept='last', v=True)
0/213 features with null variance, reduction of 0.0%
9/213 features with identical variance, reduction of 4.2%

1.2 - Filter correlated features

In [6]:
# N.B : functions from mlearn prepocessor module 

# Get filtered features dataframe
features = filter_correlated_features(df_without_targets, threshold=0.5, verbose=True)
# features = df_without_targets 
38/211 features correlated, reduction of 18.0%
In [7]:
features.shape
Out[7]:
(6129, 173)
In [8]:
# Remove energy variables
delete_cols(features, ['Main_energy_electricity', 'Main_energy_steam'])
/home/adil/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py:3997: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

2 - Split training set & testing set

In [9]:
# Features data
X = features

# Extract features labels
training_features = X.columns.tolist()

# Targets data
y = df[targets_cols]

# Training & testing sets split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split targets (energy consumption & gaz emissions features)
y_train_energy, y_train_emissions = [y_train[target] for target in targets_cols]
y_test_energy, y_test_emissions = [y_test[target] for target in targets_cols]

3 - Feature Scaling (standardize data)

In [10]:
# Standardize data (center & reduce)
# N.B : function from mlearn preprocessor module
X_train_std, std_scaler = standard_scaler(X_train, return_std_scaler=True)
X_test_std = std_scaler.transform(X_test)

# Build training & testing data dictionary for each target
energy_data = {"train": [X_train_std, y_train_energy],
               "test": [X_test_std, y_test_energy]}

emissions_data = {"train": [X_train_std, y_train_emissions],
                  "test": [X_test_std, y_test_emissions]}
In [11]:
# Build dataframes from standardized training & testing data
# => permet de faciliter le filtrage des features qui contribuent le plus au model
X_train_std_df = pd.DataFrame(X_train_std, columns=training_features)
X_test_std_df = pd.DataFrame(X_test_std, columns=training_features)

II - Dummies regressions :

1 - Dummy regression ( target : 'SiteEnergyUse(kBtu)' )

In [12]:
y_pred_dum_energy, y_test_dum_energy = dummy_regression(X_train_std,
                                                        y_train_energy,
                                                        X_test_std,
                                                        y_test_energy,
                                                        strategy='mean')

print('Dummy regression : RMSE = {}'.format(mean_squared_error(y_test_energy,
                                                               y_pred_dum_energy,
                                                               squared=False)))

print('Dummy regression : R2 = {}'.format(r2_score(y_test_energy, y_pred_dum_energy)))
Dummy regression : RMSE = 1.1270469116069959
Dummy regression : R2 = -0.0002167772201731033

2 - Dummy regression ( target : 'TotalGHGEmissions' )

In [13]:
y_pred_dum_emissions, y_test_dum_emissions = dummy_regression(X_train_std,
                                                               y_train_emissions,
                                                               X_test_std,
                                                               y_test_emissions,
                                                               strategy='mean')

print('Dummy regression : RMSE = {}'.format(mean_squared_error(y_test_emissions,
                                                               y_pred_dum_emissions,
                                                               squared=False)))

print('Dummy regression : R2 = {}'.format(r2_score(y_test_emissions, y_pred_dum_emissions)))
Dummy regression : RMSE = 1.488466479230585
Dummy regression : R2 = -6.541434733620122e-05

III - Modelize :

Methodology : linear model & ensemble methods :

For each target we carry out the following cycle :

  • Baseline model training (default (hyper)parameters)
  • Optimization of hyperparameters cyclically (progressive reduction of the range of values applied to grid search) with cross validation
  • Model performance evaluation (RMSE, R2, run time)
  • Model features contributions evaluation
  • Feature reduction :
    • conversion of negative coefficients into positive coefficients (np.abs())
    • filtering of zero coefficients
    • application of thresholds (modal class, median, third quartile)
  • Second training cycle from reduced features
  • Best model selection by a trade-off between total features & score (RMSE, R2))
  • Training & learning curves evaluation
  • Define model learning potential from learning curve evaluation ('Yes'/'No')

1 - Linear model : ElasticNet

1.1 - ElasticNet (target : 'SiteEnergyUse(kBtu)' )

1.1.1 - Train model (optimize hyperparameters)

In [14]:
# alphas range : np.logspace(-3, 3, 10), np.arange(0.001, 0.009, 0.001)
enet_en_params_grid = {"alphas": np.logspace(-3, 3, 10),
                       "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}

enet_en_data = train_elastic_net(energy_data, enet_en_params_grid)
Alpha : 0.004641588833612777 | l1_ratio : 0.8

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.559
- R2 = 0.754

1.1.2 - Regularization path

In [ ]:
en_alphas, en_coefs = enet_en_data["reg_path_data"]

# N.B : function from mlearn evaluator module
plot_regularization_path(en_alphas, en_coefs.T, training_features, n_features_labels=10)

1.1.3 - Select most important features (method : cumulative feature importance selection)

In [15]:
# N.B : Elastic Net reduces the coefficients of irrelevant features to 0. 
# We have therefore selected here features with positive or negative coefficients

elastic_net_en_coefs = enet_en_data["model"].coef_

elastic_net_en_features_coefs_df = get_features_importance(training_features,
                                                           elastic_net_en_coefs,
                                                           abs_coefs=True,
                                                           non_zero_coefs=True,
                                                           verbose=True)
# N.B : function from mlearn preprocessor module
plot_cumulative_features_importance(elastic_net_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
123/171 features selected, reduction of 28.1%
In [16]:
# Select the 64 most important features
enet_en_mif_61_data = elastic_net_en_features_coefs_df.iloc[:61]
enet_en_mif_61_labels = enet_en_mif_61_data['feature'].tolist()
In [17]:
# Visualize n most important features
# N.B : function from mlearn preprocessor module
plot_n_top_features(enet_en_mif_61_data,
                    model_name(enet_en_data["model"]),
                    n=10,
                    plot_size=(12, 4))

1.1.4 - Second training cycle (with cumulative feature importance selection method)

In [18]:
# Second training cycle with features reduced (173 -> 61)

X_train_std_en_61f = X_train_std_df.loc[:, enet_en_mif_61_labels]
X_test_std_en_61f = X_test_std_df.loc[:, enet_en_mif_61_labels]


# Build training & testing data dictionary for each target
en_data_reduced_61f = {"train": [X_train_std_en_61f, y_train_energy],
                       "test": [X_test_std_en_61f, y_test_energy]}


enet_en_reduced_data_61f = train_elastic_net(en_data_reduced_61f, enet_en_params_grid)
Alpha : 0.004641588833612777 | l1_ratio : 0.2

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.563
- R2 = 0.75

1.1.5 - Select most important features (method : threshold feature importance selection)

In [19]:
# We use the same model (from first training cycle)
mif_q3 = elastic_net_en_features_coefs_df['coefficient'].quantile(q=0.75)
ffd = filter_features_by_threshold(X, X_train_std, X_test_std, enet_en_data['model'], mif_q3)
print('Features selected : {}'.format(len(ffd['labels'])))
enet_en_mif_31_labels = ffd['labels']
Features selected : 31

1.1.6 - Second training cycle (with threshold feature selection method)

In [20]:
# Second training cycle with features reduced (173 -> 31)

X_train_std_en_31f = X_train_std_df.loc[:, enet_en_mif_31_labels]
X_test_std_en_31f = X_test_std_df.loc[:, enet_en_mif_31_labels]


# Build training & testing data dictionary for each target
en_data_reduced_31f = {"train": [X_train_std_en_31f, y_train_energy],
                       "test": [X_test_std_en_31f, y_test_energy]}


enet_en_reduced_data_31f = train_elastic_net(en_data_reduced_31f, enet_en_params_grid)
# Cumulative feature importance selection method seems to be slightly better
# (fewer variables selected and a slightly better r2)
Alpha : 0.001 | l1_ratio : 0.2

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.559
- R2 = 0.754

1.1.7 - Training curve

In [21]:
# Plot training curve  (best model selected by a trade-off between total features & score (RMSE, R2))

# N.B : function from mlearn evaluator module
plot_validation_curve(ElasticNet(alpha=0.001, l1_ratio=0.2),
                      X_train_std_en_31f,
                      y_train_energy,
                      'alpha',
                      np.logspace(-3, 3, 10),
                      log_scale=True,
                      scorer='neg_root_mean_squared_error')

1.1.8 - Learning curve

In [22]:
# Plot learning curve  (best model selected by a trade-off between total features & score (RMSE, R2))

# N.B : function from mlearn evaluator module
plot_learning_curve(ElasticNet(alpha=0.001, l1_ratio=0.2, max_iter=2000),
                    X_train_std_en_31f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : No
# Model performance seems to stagnate from around 4 000 observations
In [23]:
enet_en_reduced_data_31f['learning_potential'] = 'No'

1.2 - ElasticNet (target : 'TotalGHGEmissions' )

N.B : From here we will use wrappers functions in order to :

  • select the most important features
  • repeat training cycles

1.2.1 - Train model (optimize hyperparameters)

In [24]:
# Train model

enet_em_params_grid = {"alphas": np.logspace(-3, 3, 10),
                       "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}

enet_em_data = train_elastic_net(emissions_data, enet_em_params_grid)
Alpha : 0.004641588833612777 | l1_ratio : 0.99

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.983
- R2 = 0.564

1.2.2 - Regularization path

In [25]:
em_alphas, em_coefs = enet_em_data["reg_path_data"]

plot_regularization_path(em_alphas, em_coefs.T, training_features, n_features_labels=10)

1.2.3 - Select most important features (method : cumulative feature importance selection)

In [26]:
elastic_net_em_coefs = enet_em_data["model"].coef_

elastic_net_em_features_coefs_df = get_features_importance(training_features,
                                                           elastic_net_em_coefs,
                                                           abs_coefs=True,
                                                           non_zero_coefs=True,
                                                           verbose=True)

plot_cumulative_features_importance(elastic_net_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
142/171 features selected, reduction of 17.0%
In [27]:
enet_em_mif_70_data, enet_em_mif_70_labels = select_most_important_features(elastic_net_em_features_coefs_df,
                                                                            n=70,
                                                                            method='cumsum')
# Visualize n most important features
plot_n_top_features(enet_em_mif_70_data,
                    model_name(enet_em_data["model"]),
                    n=10,
                    plot_size=(12, 4))

1.2.4 - Second training cycle (with cumulative feature importance selection method)

In [28]:
# Second training cycle with features reduced (173 -> 70)

# en_params_grid = {"alphas": np.arange(0.0020, 0.0030, 0.0001),
#                   "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}

enet_em_reduced_data_70f, train_and_test_data_reduced_70f  = run_training_cycle(enet_em_mif_70_labels,
                                                                                enet_em_params_grid,
                                                                                training_wrapper='en') 
Alpha : 0.004641588833612777 | l1_ratio : 0.2

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.565
- R2 = 0.748

1.2.5 - Select most important features (method : threshold feature importance selection)

In [29]:
enet_em_mif_n_data, enet_em_mif_n_labels = select_most_important_features(elastic_net_em_features_coefs_df,
                                                                          method='threshold',
                                                                          model=enet_em_data["model"],
                                                                          thr='q',
                                                                          q_value=0.75, # Q3
                                                                          v=True)
36/171 selected features, reduction of 21.1%

1.2.6 - Second training cycle (with threshold feature selection method)

In [30]:
# Second training cycle with features reduced (173 -> 36)

enet_em_reduced_data_36f, train_and_test_data_reduced_36f  = run_training_cycle(enet_em_mif_n_labels,
                                                                                enet_em_params_grid,
                                                                                training_wrapper='en') 
Alpha : 0.001 | l1_ratio : 0.2

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.572
- R2 = 0.743

1.2.7 - Training curve

In [31]:
# Plot training curve

x_train_std_em_36f = train_and_test_data_reduced_36f['train'][0]

plot_validation_curve(ElasticNet(alpha=0.001, l1_ratio=0.2),
                      x_train_std_em_36f,
                      y_train_emissions,
                      'alpha',
                      np.logspace(-3, 3, 10),
                      log_scale=True,
                      scorer='neg_root_mean_squared_error')

1.2.8 - Learning curve

In [32]:
# Plot learning curve

plot_learning_curve(ElasticNet(alpha=0.001, l1_ratio=0.2, max_iter=2000),
                    x_train_std_em_36f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : No
# The curve stagnates around 3500 observations
In [33]:
enet_em_reduced_data_36f['learning_potential'] = 'No'

2 - Ensemble methods : Random forest, XGBoost

1.1 - Random Forest (target : 'SiteEnergyUse(kBtu)' )

1.1.1 - Train model (optimize hyperparameters)

In [34]:
%%time 

# Random Forest

rfr_model = RandomForestRegressor(random_state=42)

#    n_estimators range      |       max_depth range
#  np.arange(200, 550, 50)   |      [2, 4, 6, 8, 10]
# [400, 500, 600]

rfr_en_params = {'n_estimators': [100], # 250
#                'max_depth': [150],
#                'max_features': ['auto', 'sqrt', 'log2']
                 'min_samples_leaf': [1]}

rfr_en_data = train_gridsearch(energy_data,
                               rfr_model,
                               rfr_en_params,
                               k=10)

# print(rfr_en_data['model'])
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.47
- R2 = 0.826
--------------------------------
Training set performances (CV) :
- RMSE = 0.432
- R2 = 0.85
CPU times: user 49.2 s, sys: 133 ms, total: 49.3 s
Wall time: 49.4 s

1.1.2 - Select most important features (method : cumulative feature importance selection)

In [35]:
rfr_en_model = rfr_en_data['model']
rfr_en_coefs = rfr_en_model.feature_importances_

rfr_en_features_coefs_df = get_features_importance(training_features,
                                                   rfr_en_coefs,
                                                   abs_coefs=True,
                                                   non_zero_coefs=False,
                                                   verbose=False)

plot_cumulative_features_importance(rfr_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
In [36]:
rfr_en_mif_12_data, rfr_en_mif_12_labels = select_most_important_features(rfr_en_features_coefs_df,
                                                                          n=12,
                                                                          method='cumsum')
# Visualize n most important features
plot_n_top_features(rfr_en_mif_12_data,
                    model_name(rfr_en_model),
                    n=10,
                    plot_size=(12, 4))

1.1.3 - Second training cycle (with cumulative feature importance selection method)

In [37]:
# Second training cycle with features reduced (173 -> 12)

rfr_en_model_c2 = RandomForestRegressor(random_state=42)

rfr_en_reduced_data_12f, train_and_test_data_reduced_12f  = run_training_cycle(rfr_en_mif_12_labels,
                                                                               rfr_en_params,
                                                                               rfr_en_model_c2) 
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.48
- R2 = 0.818
--------------------------------
Training set performances (CV) :
- RMSE = 0.476
- R2 = 0.82

1.1.4 - Display random decision tree (from cumulative feature importance selection method)

In [38]:
# Display naiv decision tree
rfr_en_naiv_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=3)
rfr_en_naiv_model.fit(*train_and_test_data_reduced_12f['train'])

display_decision_tree(rfr_en_naiv_model,
                      rfr_en_mif_12_labels,
                      targets_cols[0],
                      tree_nb=5)
Out[38]:

1.1.5 - Select most important features (method : threshold feature importance selection)

In [39]:
rfr_en_mif_n_data, rfr_en_mif_n_labels = select_most_important_features(rfr_en_features_coefs_df,
                                                                        method='threshold',
                                                                        model=rfr_en_model,
                                                                        thr='q',
                                                                        q_value=0.75, # Q3
                                                                        v=True)
43/171 selected features, reduction of 25.1%

1.1.6 - Second training cycle (with threshold feature selection method)

In [40]:
# Second training cycle with features reduced (173 -> 43)

rfr_en_model_c2 = RandomForestRegressor(random_state=42, n_jobs=-1)

rfr_en_reduced_data_43f, train_and_test_data_reduced_43f  = run_training_cycle(rfr_en_mif_n_labels,
                                                                               rfr_en_params,
                                                                               rfr_en_model_c2)
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.475
- R2 = 0.822
--------------------------------
Training set performances (CV) :
- RMSE = 0.457
- R2 = 0.834

1.1.7 - Display random decision tree (from threshold feature selection method)

In [41]:
# Display naiv decision tree

rfr_en_naiv_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=3)
rfr_en_naiv_model.fit(*train_and_test_data_reduced_43f['train'])

display_decision_tree(rfr_en_naiv_model,
                      rfr_en_mif_n_labels,
                      targets_cols[0],
                      tree_nb=5)
Out[41]:

1.1.8 - Training curve

In [ ]:
# Plot training curve

x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]

plot_validation_curve(RandomForestRegressor(n_estimators=100, random_state=42),
                      x_train_std_en_12f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(50, 500, 50),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

1.1.9 - Learning curve

In [ ]:
# Plot learning curve

plot_learning_curve(RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
                    x_train_std_en_12f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [42]:
rfr_en_reduced_data_12f['learning_potential'] = 'Yes'

1.2 - Random Forest (target : 'TotalGHGEmissions' )

1.2.1 - Train model (optimize hyperparameters)

In [43]:
%%time 

# Random Forest

rfr_model = RandomForestRegressor(random_state=42) #,n_jobs=-1)

#    n_estimators range      |       max_depth range
#  np.arange(200, 550, 50)   |      [2, 4, 6, 8, 10]
# [400, 500, 600]

rfr_em_params = {'n_estimators': [350], # 
#                'max_depth': [40],
#                'max_features': ['auto', 'sqrt', 'log2']
                 'min_samples_leaf': [1]}

rfr_em_data = train_gridsearch(emissions_data,
                               rfr_model,
                               rfr_em_params,
                               k=10)

print(rfr_em_data['model'])
"""
- RMSE = 0.726 722
- R2 = 0.762
"""
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.722
- R2 = 0.765
--------------------------------
Training set performances (CV) :
- RMSE = 0.735
- R2 = 0.749
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=350, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
CPU times: user 2min 36s, sys: 749 ms, total: 2min 37s
Wall time: 2min 37s
Out[43]:
'\n- RMSE = 0.726 722\n- R2 = 0.762\n'

1.2.2 - Select most important features (method : cumulative feature importance selection)

In [44]:
rfr_em_model = rfr_em_data['model']
rfr_em_coefs = rfr_em_model.feature_importances_

rfr_em_features_coefs_df = get_features_importance(training_features,
                                                   rfr_em_coefs,
                                                   abs_coefs=True,
                                                   non_zero_coefs=False,
                                                   verbose=False)

plot_cumulative_features_importance(rfr_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
In [45]:
rfr_em_mif_18_data, rfr_em_mif_18_labels = select_most_important_features(rfr_em_features_coefs_df,
                                                                          n=18,
                                                                          method='cumsum')
# Visualize n most important features
plot_n_top_features(rfr_em_mif_18_data,
                    model_name(rfr_em_model),
                    n=10,
                    plot_size=(12, 4))

1.2.3 - Second training cycle (with cumulative feature importance selection method)

In [46]:
# Second training cycle with features reduced (173 -> 18)

rfr_em_model_c2 = RandomForestRegressor(random_state=42, n_jobs=-1)

rfr_em_reduced_data_18f, train_and_test_data_reduced_18f  = run_training_cycle(rfr_em_mif_18_labels,
                                                                               rfr_em_params,
                                                                               rfr_em_model_c2) 
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.474
- R2 = 0.823
--------------------------------
Training set performances (CV) :
- RMSE = 0.468
- R2 = 0.826

1.2.4 - Display random decision tree (from cumulative feature importance selection method)

In [47]:
# Display naiv decision tree
rfr_em_naiv_model = RandomForestRegressor(n_estimators=350, random_state=42, max_depth=3)
rfr_em_naiv_model.fit(*train_and_test_data_reduced_18f['train'])

display_decision_tree(rfr_em_naiv_model,
                      rfr_em_mif_18_labels,
                      targets_cols[1],
                      tree_nb=5)
Out[47]:

1.2.5 - Select most important features (method : threshold feature importance selection)

In [48]:
rfr_em_mif_n_data, rfr_em_mif_n_labels = select_most_important_features(rfr_em_features_coefs_df,
                                                                        method='threshold',
                                                                        model=rfr_em_model,
                                                                        thr='q',
                                                                        q_value=0.75, # Q3
                                                                        v=True)
43/171 selected features, reduction of 25.1%

1.2.6 - Second training cycle (with threshold feature selection method)

In [49]:
# Second training cycle with features reduced (173 -> 43)

rfr_em_model_c2 = RandomForestRegressor(random_state=42)

rfr_em_reduced_data_43f, train_and_test_data_reduced_43f = run_training_cycle(rfr_em_mif_n_labels,
                                                                              rfr_em_params,
                                                                              rfr_em_model_c2)
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.475
- R2 = 0.822
--------------------------------
Training set performances (CV) :
- RMSE = 0.453
- R2 = 0.836

1.2.7 - Display random decision tree (from threshold feature selection method)

In [50]:
# Display naiv decision tree

rfr_em_naiv_model = RandomForestRegressor(n_estimators=350, random_state=42, max_depth=3)
rfr_em_naiv_model.fit(*train_and_test_data_reduced_43f['train'])

display_decision_tree(rfr_em_naiv_model,
                      rfr_em_mif_n_labels,
                      targets_cols[1],
                      tree_nb=5)
Out[50]:

1.2.8 - Training curve

In [ ]:
# Plot training curve

x_train_std_em_18f = train_and_test_data_reduced_18f['train'][0]

plot_validation_curve(RandomForestRegressor(n_estimators=350, random_state=42, n_jobs=-1),
                      x_train_std_em_18f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 800, 200),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

1.2.9 - Learning curve

In [ ]:
# Plot learning curve

plot_learning_curve(RandomForestRegressor(n_estimators=350, random_state=42, n_jobs=-1),
                    x_train_std_em_18f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [51]:
rfr_em_reduced_data_18f['learning_potential'] = 'Yes'

2.1 - XGBoost (target : 'SiteEnergyUse(kBtu)' )

2.1.1 - Train model (optimize hyperparameters)

In [52]:
%%time

# XGBoost
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')

# xgb_reg_params = {'n_estimators': [150, 200, 250],
#                   'alpha': [0.5, 2, 5],
#                   'learning_rate': np.arange(0.1, 1.1, 0.1),
#                   'max_depth': [5, 10, 20],
#                   'colsample_bytree': [0.3],
#                   #'subsample': np.arange(0.1, 1.1, 0.1)
#                  }

xgb_reg_en_params = {'n_estimators': [500],
                     'alpha': [0.5], # selected from np.arange(1, 2, 0.1),
                     'learning_rate': [0.062], # 0.065
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                    }

xgb_en_data = train_gridsearch(energy_data,
                               xgb_reg,
                               xgb_reg_en_params)


print(xgb_en_data['model'])
XGBRegressor
--------------------------
Testing set performances :
- RMSE = 0.418
- R2 = 0.862
--------------------------------
Training set performances (CV) :
- RMSE = 0.396
- R2 = 0.873
XGBRegressor(alpha=0.5, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.35, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.062, max_delta_step=0, max_depth=20,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0.5,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
CPU times: user 7min 19s, sys: 6.88 s, total: 7min 26s
Wall time: 2min 19s

2.1.2 - Select most important features (method : cumulative feature importance selection)

In [53]:
xgb_reg_en_model = xgb_en_data['model']
xgb_reg_en_coefs = xgb_reg_en_model.feature_importances_

xgb_reg_en_features_coefs_df = get_features_importance(training_features,
                                                       xgb_reg_en_coefs,
                                                       abs_coefs=True,
                                                       non_zero_coefs=False,
                                                       verbose=False)

plot_cumulative_features_importance(xgb_reg_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
In [54]:
xgb_reg_en_mif_72_data, xgb_reg_en_mif_72_labels = select_most_important_features(xgb_reg_en_features_coefs_df,
                                                                                  n=72,
                                                                                  method='cumsum')
# Visualize n most important features
plot_n_top_features(xgb_reg_en_mif_72_data,
                    model_name(xgb_reg_en_model),
                    n=10,
                    plot_size=(12, 4))

2.1.3 - Second training cycle (with cumulative feature importance selection method)

In [55]:
%%time 

# Second training cycle with features reduced (173 -> 72)

xgb_reg_en_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')


xgb_reg_en_reduced_data_72f, train_and_test_data_reduced_72f  = run_training_cycle(xgb_reg_en_mif_72_labels,
                                                                                   xgb_reg_en_params,
                                                                                   xgb_reg_en_model_c2) 
XGBRegressor
--------------------------
Testing set performances :
- RMSE = 0.425
- R2 = 0.858
--------------------------------
Training set performances (CV) :
- RMSE = 0.424
- R2 = 0.856
CPU times: user 1min 27s, sys: 427 ms, total: 1min 27s
Wall time: 23.3 s

2.1.4 - Display random decision tree (from cumulative feature importance selection method)

In [56]:
# Display naiv decision tree
xgb.plot_tree(xgb_reg_en_reduced_data_72f['model'], num_trees=5)
plt.show()

2.1.5 - Select most important features (method : threshold feature importance selection)

In [57]:
xgb_reg_en_mif_n_data, xgb_reg_en_mif_n_labels = select_most_important_features(xgb_reg_en_features_coefs_df,
                                                                                method='threshold',
                                                                                model=xgb_reg_en_model,
                                                                                thr='q',
                                                                                q_value=0.75, # Q3
                                                                                v=True)
43/171 selected features, reduction of 25.1%

2.1.6 - Second training cycle (with threshold feature selection method)

In [58]:
# Second training cycle with features reduced (173 -> 43)

xgb_reg_en_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_en_reduced_data_43f, train_and_test_data_reduced_43f  = run_training_cycle(xgb_reg_en_mif_n_labels,
                                                                                   xgb_reg_en_params,
                                                                                   xgb_reg_en_model_c2) 
XGBRegressor
--------------------------
Testing set performances :
- RMSE = 0.447
- R2 = 0.843
--------------------------------
Training set performances (CV) :
- RMSE = 0.432
- R2 = 0.852

2.1.7 - Display random decision tree (from threshold feature selection method)

In [59]:
# Display naiv decision tree
xgb.plot_tree(xgb_reg_en_reduced_data_43f['model'], num_trees=5)
plt.show()

2.1.8 - Training curve

In [60]:
# Plot training curve

x_train_std_en_43f = train_and_test_data_reduced_43f['train'][0]


xgb_reg_en_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 0.5,
                     'learning_rate': 0.062,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}


plot_validation_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                      x_train_std_en_43f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 800, 200),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

2.1.9 - Learning curve

In [ ]:
# Plot learning curve

plot_learning_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                    x_train_std_en_43f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [61]:
xgb_reg_en_reduced_data_43f['learning_potential'] = 'Yes'

2.2 - XGBoost (target : 'TotalGHGEmissions' )

2.2.1 - Train model (optimize hyperparameters)

In [62]:
%%time

# XGBoost
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')

# xgb_reg_params = {'n_estimators': [150, 200, 250, 300],
#                   'alpha': [2, 5],
#                   'learning_rate': np.arange(0.1, 1.1, 0.1),
#                   'max_depth': [5, 10, 20],
#                   'colsample_bytree': [0.3],
#                   'subsample': np.arange(0.1, 1.1, 0.1)}

xgb_reg_em_params = {'n_estimators': [750],
                     'alpha': [1.6], # selected from np.arange(1, 2, 0.1),
                     'learning_rate': [0.05],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }


xgb_em_data = train_gridsearch(emissions_data,
                               xgb_reg,
                               xgb_reg_em_params)


print(xgb_em_data['model'])
XGBRegressor
--------------------------
Testing set performances :
- RMSE = 0.673
- R2 = 0.795
--------------------------------
Training set performances (CV) :
- RMSE = 0.701
- R2 = 0.771
XGBRegressor(alpha=1.6, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.35, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=20,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=750, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=1.60000002,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
CPU times: user 10min 23s, sys: 7.89 s, total: 10min 31s
Wall time: 3min 7s

2.2.2 - Select most important features (method : cumulative feature importance selection)

In [63]:
xgb_reg_em_model = xgb_em_data['model']
xgb_reg_em_coefs = xgb_reg_em_model.feature_importances_

xgb_reg_em_features_coefs_df = get_features_importance(training_features,
                                                       xgb_reg_em_coefs,
                                                       abs_coefs=True,
                                                       non_zero_coefs=False,
                                                       verbose=False)

plot_cumulative_features_importance(xgb_reg_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
In [64]:
xgb_reg_em_mif_79_data, xgb_reg_em_mif_79_labels = select_most_important_features(xgb_reg_em_features_coefs_df,
                                                                                  n=79,
                                                                                  method='cumsum')
# Visualize n most important features
plot_n_top_features(xgb_reg_em_mif_79_data,
                    model_name(xgb_reg_em_model),
                    n=10,
                    plot_size=(12, 4))

2.2.3 - Second training cycle (with cumulative feature importance selection method)

In [ ]:
%%time 

# Second training cycle with features reduced (173 -> 79)

xgb_reg_em_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')


xgb_reg_em_reduced_data_79f, train_and_test_data_reduced_79f  = run_training_cycle(xgb_reg_em_mif_79_labels,
                                                                                   xgb_reg_em_params,
                                                                                   xgb_reg_em_model_c2) 

2.2.4 - Display random decision tree (from cumulative feature importance selection method)

In [ ]:
# Display naiv decision tree
xgb.plot_tree(xgb_reg_em_reduced_data_79f['model'], num_trees=5)
plt.show()

2.2.5 - Select most important features (method : threshold feature importance selection)

In [65]:
xgb_reg_em_mif_n_data, xgb_reg_em_mif_n_labels = select_most_important_features(xgb_reg_em_features_coefs_df,
                                                                                method='threshold',
                                                                                model=xgb_reg_em_model,
                                                                                thr='q',
                                                                                q_value=0.75, # Q3
                                                                                v=True)
43/171 selected features, reduction of 25.1%

2.2.6 - Second training cycle (with threshold feature selection method)

In [66]:
# Second training cycle with features reduced (173 -> 43)

xgb_reg_em_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_em_reduced_data_43f, train_and_test_data_reduced_43f  = run_training_cycle(xgb_reg_em_mif_n_labels,
                                                                                   xgb_reg_em_params,
                                                                                   xgb_reg_em_model_c2) 
XGBRegressor
--------------------------
Testing set performances :
- RMSE = 0.441
- R2 = 0.847
--------------------------------
Training set performances (CV) :
- RMSE = 0.451
- R2 = 0.838

2.2.7 - Display random decision tree (from threshold feature selection method)

In [ ]:
# Display naiv decision tree
xgb.plot_tree(xgb_reg_em_reduced_data_43f['model'], num_trees=5)
plt.show()

2.2.8 - Training curve

In [67]:
# Plot training curve

x_train_std_em_43f = train_and_test_data_reduced_43f['train'][0]

xgb_reg_em_params = {'objective': 'reg:squarederror',
                     'n_estimators': 750,
                     'alpha': 1.6,
                     'learning_rate': 0.05,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}

plot_validation_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                      x_train_std_em_43f,
                      y_train_emissions,
                      'n_estimators',
                      np.arange(200, 800, 200),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

2.2.9 - Learning curve

In [ ]:
# Plot learning curve

plot_learning_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                    x_train_std_em_43f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [68]:
xgb_reg_em_reduced_data_43f['learning_potential'] = 'Yes'

3 - Feature Selection : union of the variables selected by the models

In [69]:
# Get union from best models selected features for each target


# Energy best selected features


# Merge best selected features from models
en_total_best_features = enet_en_mif_31_labels + rfr_en_mif_12_labels + xgb_reg_en_mif_n_labels
# Extract best unique selected features from merged list
en_best_features = list(set(en_total_best_features))


# Emissions best selected features


# Merge best selected features from models
em_total_best_features = enet_em_mif_n_labels + rfr_em_mif_18_labels + xgb_reg_em_mif_n_labels
# Extract best unique selected features from merged list
em_best_features = list(set(em_total_best_features))

# Display best features count for each target
print('Energy target has {} best features'.format(len(en_best_features)))
print('Emissions target has {} best features'.format(len(em_best_features)))
Energy target has 51 best features
Emissions target has 52 best features

4 - Non linear models : Kernel SVR, MLPR

N.B :

Nonlinear models do not allow to measure in a simple way feature contributions from models

We will therefore proceed here differently by:

  • training our models from all the features.
  • carrying out a second training cycle based on the most relevant features selected by the best model (for each target)

1.1 Kernel SVR (target : 'SiteEnergyUse(kBtu)' )

1.1.1 - Train model (optimize hyperparameters)

In [70]:
%%time

kernel_svr_model = SVR(kernel='rbf')

kernel_svr_en_params = {'C': [250], # 300 # np.arange(50, 550, 50),  np.logspace(-3, 3, 10)
                        'gamma': [0.001],
                        'epsilon': [0.1]}

kernel_svr_en_data = train_gridsearch(energy_data,
                                      kernel_svr_model,
                                      kernel_svr_en_params)
"""
- RMSE = 0.538
- R2 = 0.772
"""
kernel_svr_en_model = kernel_svr_en_data['model']
kernel_svr_en_model
SVR
--------------------------
Testing set performances :
- RMSE = 0.538
- R2 = 0.772
--------------------------------
Training set performances (CV) :
- RMSE = 0.518
- R2 = 0.785
CPU times: user 1min 37s, sys: 485 ms, total: 1min 37s
Wall time: 1min 38s
Out[70]:
SVR(C=250, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.001,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

1.1.2 - Second training cycle (with feature selection)

In [71]:
%%time 

# Second training cycle with features reduced

kernel_svr_en_model = SVR(kernel='rbf')

# We test the different sets of selected features for each model by also adding the union of these
en_selected_feature_sets_dict = {'ElasticNet': enet_en_mif_31_labels,
                                 'RandomForestRegressor': rfr_en_mif_12_labels,
                                 'XGBoostRegressor': xgb_reg_en_mif_n_labels,
                                 'Union of features selected by each model': en_best_features}

def run_training_cycle_for_each_feature_set(feature_sets, model, model_param_grid):
    # Second cycle results dictionary 
    second_cycle_results = {'rmse': [],
                            'r2': [],
                            'time': [],
                            'total selected features': [],
                            'selected features by': []}
    # Run second training cycle for each selected features lists
    for model_of_feature_set, selected_features in feature_sets.items():
        
        model_reduced_data_nf, train_and_test_data_reduced_nf = run_training_cycle(selected_features,
                                                                                   model_param_grid,
                                                                                   model,
                                                                                   v=False)
        second_cycle_results['rmse'].append(model_reduced_data_nf['rmse'])
        second_cycle_results['r2'].append(model_reduced_data_nf['r2'])
        second_cycle_results['time'].append(model_reduced_data_nf['time'])
        second_cycle_results['total selected features'].append(len(selected_features))
        second_cycle_results['selected features by'].append(model_of_feature_set)
    # Build second cycle results
    second_cycle_results_df = pd.DataFrame(second_cycle_results)
    return second_cycle_results_df


kernel_svr_en_second_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                                kernel_svr_en_model,
                                                                                kernel_svr_en_params)

# Best performances for each feature set
kernel_svr_en_second_cycle_results_df
CPU times: user 55.1 s, sys: 508 ms, total: 55.6 s
Wall time: 55.6 s
Out[71]:
rmse r2 time total selected features selected features by
0 0.554 0.758 13 s 31 ElasticNet
1 0.589 0.727 9 s 12 RandomForestRegressor
2 0.545 0.766 14 s 43 XGBoostRegressor
3 0.543 0.768 18 s 51 Union of features selected by each model
In [ ]:
# Train best Kernel SVR model with appropriate selected features list
kernel_svr_en_reduced_data_12f, train_and_test_data_reduced_12f  = run_training_cycle(rfr_en_mif_12_labels,
                                                                                      kernel_svr_en_params,
                                                                                      kernel_svr_en_model,
                                                                                      v=False)

1.1.3 - Training curve

In [ ]:
x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]

kernel_svr_en_params = {'kernel': 'rbf',
                        'C': 250,
                        'gamma': 0.001,
                        'epsilon': 0.1}

plot_validation_curve(SVR(**kernel_svr_en_params),
                      x_train_std_en_12f,
                      y_train_energy,
                      'C',
                      np.arange(50, 550, 150),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

# Fix C hyperparameter to 200
In [72]:
# Train best Kernel SVR model with appropriate selected features list (and C = 200)

kernel_svr_en_params = {'C': [200],
                        'gamma': [0.001],
                        'epsilon': [0.1]}

kernel_svr_en_reduced_data_12f, train_and_test_data_reduced_12f  = run_training_cycle(rfr_en_mif_12_labels,
                                                                                      kernel_svr_en_params,
                                                                                      kernel_svr_en_model,
                                                                                      v=False)

1.1.4 - Learning curve

In [ ]:
# Plot learning curve

x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]

kernel_svr_en_params = {'kernel': 'rbf',
                        'C': 200,
                        'gamma': 0.001,
                        'epsilon': 0.1}

# N.B : function from mlearn evaluator module
plot_learning_curve(SVR(**kernel_svr_en_params),
                    x_train_std_en_12f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [73]:
kernel_svr_en_reduced_data_12f['learning_potential'] = 'No'

1.2 Kernel SVR (target : 'TotalGHGEmissions' )

1.2.1 - Train model (optimize hyperparameters)

In [74]:
%%time

kernel_svr_model = SVR(kernel='rbf')

kernel_svr_em_params = {'C': [250], # 300
                        'gamma': [0.001], # 0.01
                        'epsilon': [0.1]}

kernel_svr_em_data = train_gridsearch(emissions_data,
                                      kernel_svr_model,
                                      kernel_svr_em_params)
"""
- RMSE = 0.933
- R2 = 0.607
"""

kernel_svr_em_data['model']
SVR
--------------------------
Testing set performances :
- RMSE = 0.933
- R2 = 0.607
--------------------------------
Training set performances (CV) :
- RMSE = 0.949
- R2 = 0.582
CPU times: user 1min 28s, sys: 469 ms, total: 1min 29s
Wall time: 1min 29s
Out[74]:
SVR(C=250, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.001,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

1.2.2 - Second training cycle (with feature selection)

In [75]:
%%time 

# Second training cycle with features reduced

kernel_svr_em_model = SVR(kernel='rbf')

# We test the different sets of selected features for each model by also adding the union of these
em_selected_feature_sets_dict = {'ElasticNet': enet_em_mif_n_labels,
                                 'RandomForestRegressor': rfr_em_mif_18_labels,
                                 'XGBoostRegressor': xgb_reg_em_mif_n_labels,
                                 'Union of features selected by each model': em_best_features}

kernel_svr_em_second_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                                kernel_svr_em_model,
                                                                                kernel_svr_em_params)
# Best performances for each feature set
kernel_svr_em_second_cycle_results_df
CPU times: user 1min 10s, sys: 601 ms, total: 1min 10s
Wall time: 1min 10s
Out[75]:
rmse r2 time total selected features selected features by
0 0.552 0.760 17 s 36 ElasticNet
1 0.572 0.742 12 s 18 RandomForestRegressor
2 0.546 0.766 19 s 43 XGBoostRegressor
3 0.547 0.764 23 s 52 Union of features selected by each model
In [76]:
kernel_svr_em_reduced_data_18f, train_and_test_data_reduced_18f  = run_training_cycle(rfr_em_mif_18_labels,
                                                                                      kernel_svr_em_params,
                                                                                      kernel_svr_em_model,
                                                                                      v=False)

1.2.3 - Training curve

In [ ]:
x_train_std_em_18f = train_and_test_data_reduced_18f['train'][0]

kernel_svr_em_params = {'kernel': 'rbf',
                        'C': 250,
                        'gamma': 0.001,
                        'epsilon': 0.1}

plot_validation_curve(SVR(**kernel_svr_em_params),
                      x_train_std_em_18f,
                      y_train_emissions,
                      'C',
                      np.arange(50, 550, 150),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

1.2.4 - Learning curve

In [ ]:
# Plot learning curve

kernel_svr_em_params = {'kernel': 'rbf',
                        'C': 250,
                        'gamma': 0.001,
                        'epsilon': 0.1}

# N.B : function from mlearn evaluator module
plot_learning_curve(SVR(**kernel_svr_em_params),
                    x_train_std_em_18f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : No
In [77]:
kernel_svr_em_reduced_data_18f['learning_potential'] = 'No'

2.1 - Multi-Layer Perceptron Regressor (target : 'SiteEnergyUse(kBtu)' )

2.1.1 - Train model (optimize hyperparameters)

In [78]:
%%time 

# Multi-Layer Perceptron Regressor

mlpr_model = MLPRegressor(activation='relu', #'identity',
                          learning_rate='adaptive',
                          alpha=4,
                          max_iter=5000,
                          verbose=False)

mlpr_en_params = {'hidden_layer_sizes': (16, 3), # pow(16, 3) < df.shape[0]
                  'learning_rate_init': [0.0008]} # 0.001

mlpr_en_data = train_gridsearch(energy_data,
                                mlpr_model,
                                mlpr_en_params)
mlpr_en_data['model']
MLPRegressor
--------------------------
Testing set performances :
- RMSE = 0.551
- R2 = 0.761
--------------------------------
Training set performances (CV) :
- RMSE = 0.538
- R2 = 0.769
CPU times: user 6min 42s, sys: 3min 22s, total: 10min 5s
Wall time: 4min 21s
Out[78]:
MLPRegressor(activation='relu', alpha=4, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=3, learning_rate='adaptive',
             learning_rate_init=0.0008, max_fun=15000, max_iter=5000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

2.1.2 - Second training cycle (with feature selection)

In [ ]:
%%time 

# Second training cycle with features reduced

mlpr_en_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=5000,
                             verbose=False)



mlpr_en_second_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                          mlpr_en_model,
                                                                          mlpr_en_params)
# Best performances for each feature set
mlpr_en_second_cycle_results_df
In [80]:
mlpr_en_reduced_data_12f, train_and_test_data_reduced_12f  = run_training_cycle(rfr_en_mif_12_labels,
                                                                                mlpr_en_params,
                                                                                mlpr_en_model,
                                                                                v=False)

2.1.3 - Training curve

In [ ]:
x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]

mlpr_en_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 5000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0008}

plot_validation_curve(MLPRegressor(**mlpr_en_params),
                      x_train_std_en_12f,
                      y_train_energy,
                      'learning_rate_init',
                      np.arange(0.0001, 0.001, 0.0001),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
In [81]:
# Run another training cycle with a learning rate = 0.0001

mlpr_en_params = {'hidden_layer_sizes': (16, 3), # pow(16, 3) < df.shape[0]
                  'learning_rate_init': [0.0001]} # 0.001

mlpr_en_reduced_data_12f, train_and_test_data_reduced_12f  = run_training_cycle(rfr_en_mif_12_labels,
                                                                                mlpr_en_params,
                                                                                mlpr_en_model,
                                                                                v=True)
MLPRegressor
--------------------------
Testing set performances :
- RMSE = 0.593
- R2 = 0.723
--------------------------------
Training set performances (CV) :
- RMSE = 0.576
- R2 = 0.737

2.1.4 - Learning curve

In [82]:
# Plot learning curve

x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]

mlpr_en_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 50000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0001}

# N.B : function from mlearn evaluator module
plot_learning_curve(MLPRegressor(**mlpr_en_params),
                    x_train_std_en_12f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : No
In [83]:
mlpr_en_reduced_data_12f['learning_potential'] = 'No'

2.2 - Multi-Layer Perceptron Regressor (target : 'TotalGHGEmissions' )

2.2.1 - Train model (optimize hyperparameters)

In [84]:
%%time 

mlpr_model = MLPRegressor(activation='relu', #'identity',
                          learning_rate='adaptive',
                          alpha=4,
                          max_iter=5000,
                          verbose=False)

mlpr_em_params = {'hidden_layer_sizes': (16, 3), # pow(16, 3) < df.shape[0]
                  'learning_rate_init': [0.00075]} # 0.001

mlpr_em_data = train_gridsearch(emissions_data,
                                mlpr_model,
                                mlpr_em_params)

mlpr_em_data['model']
MLPRegressor
--------------------------
Testing set performances :
- RMSE = 0.935
- R2 = 0.605
--------------------------------
Training set performances (CV) :
- RMSE = 0.939
- R2 = 0.59
CPU times: user 2min 51s, sys: 1min 23s, total: 4min 14s
Wall time: 1min 37s
Out[84]:
MLPRegressor(activation='relu', alpha=4, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=16, learning_rate='adaptive',
             learning_rate_init=0.00075, max_fun=15000, max_iter=5000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

2.2.2 - Second training cycle (with feature selection)

In [85]:
%%time 

# Second training cycle with features reduced

mlpr_em_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=5000,
                             verbose=False)



mlpr_em_second_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                          mlpr_em_model,
                                                                          mlpr_em_params)
# Best performances for each feature set
mlpr_em_second_cycle_results_df
CPU times: user 4min 53s, sys: 3.5 s, total: 4min 56s
Wall time: 4min 50s
Out[85]:
rmse r2 time total selected features selected features by
0 0.566 0.748 1 min 16 s 36 ElasticNet
1 0.581 0.734 1 min 10 s 18 RandomForestRegressor
2 0.566 0.748 1 min 14 s 43 XGBoostRegressor
3 0.567 0.747 1 min 10 s 52 Union of features selected by each model
In [86]:
mlpr_em_reduced_data_18f, train_and_test_data_reduced_18f  = run_training_cycle(rfr_em_mif_18_labels,
                                                                                mlpr_em_params,
                                                                                mlpr_em_model,
                                                                                v=False)

2.2.3 - Training curve

In [87]:
x_train_std_em_18f = train_and_test_data_reduced_18f['train'][0]

mlpr_em_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 5000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.00075}

plot_validation_curve(MLPRegressor(**mlpr_em_params),
                      x_train_std_em_18f,
                      y_train_emissions,
                      'learning_rate_init',
                      np.arange(0.0001, 0.001, 0.0001),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
In [88]:
# 0.0005 seems to be a better value for learning rate 

mlpr_model = MLPRegressor(activation='relu', #'identity',
                          learning_rate='adaptive',
                          alpha=4,
                          max_iter=5000,
                          verbose=False)

mlpr_em_params = {'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': [0.0005]}

mlpr_em_reduced_data_18f, train_and_test_data_reduced_18f  = run_training_cycle(rfr_em_mif_18_labels,
                                                                                mlpr_em_params,
                                                                                mlpr_em_model,
                                                                                v=True)
MLPRegressor
--------------------------
Testing set performances :
- RMSE = 0.58
- R2 = 0.735
--------------------------------
Training set performances (CV) :
- RMSE = 0.569
- R2 = 0.744

2.2.4 - Learning curve

In [89]:
# Plot learning curve

x_train_std_em_18f = train_and_test_data_reduced_18f['train'][0]

mlpr_em_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 5000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0005}

# N.B : function from mlearn evaluator module
plot_learning_curve(MLPRegressor(**mlpr_em_params),
                    x_train_std_em_18f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [90]:
mlpr_em_reduced_data_18f['learning_potential'] = 'Yes'

5 - Third training cycle with models union features : RandomForest, XGBoost

1.1 - Random forest (target : 'SiteEnergyUse(kBtu)' )

1.1.1 - Third training cycle (with feature selection)

In [92]:
%%time 

# Third training cycle with features reduced

rfr_en_model = RandomForestRegressor(random_state=42)

rfr_en_params = {'n_estimators': [100],
                 'min_samples_leaf': [1]}

rfr_en_third_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                        rfr_en_model,
                                                                        rfr_en_params)
rfr_en_third_cycle_results_df
CPU times: user 44.4 s, sys: 91.9 ms, total: 44.5 s
Wall time: 44.5 s
Out[92]:
rmse r2 time total selected features selected features by
0 0.480 0.819 11 s 31 ElasticNet
1 0.480 0.818 9 s 12 RandomForestRegressor
2 0.490 0.811 10 s 43 XGBoostRegressor
3 0.469 0.826 14 s 51 Union of features selected by each model

1.2 - Random forest (target : 'TotalGHGEmissions')

1.2.1 - Third training cycle (with feature selection)

In [94]:
%%time 

# Third training cycle with features reduced

rfr_em_model = RandomForestRegressor(random_state=42)

rfr_em_params = {'n_estimators': [350],
                 'min_samples_leaf': [1]}

rfr_em_third_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                        rfr_em_model,
                                                                        rfr_em_params)
rfr_em_third_cycle_results_df
CPU times: user 3min 8s, sys: 1.26 s, total: 3min 9s
Wall time: 3min 9s
Out[94]:
rmse r2 time total selected features selected features by
0 0.472 0.824 46 s 36 ElasticNet
1 0.474 0.823 39 s 18 RandomForestRegressor
2 0.472 0.824 50 s 43 XGBoostRegressor
3 0.470 0.826 54 s 52 Union of features selected by each model

2.1 - XGBoost (target : 'SiteEnergyUse(kBtu)' )

2.1.1 - Third training cycle (with feature selection)

In [95]:
%%time 

# Third training cycle with features reduced

xgb_reg_en_model = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_en_params = {'n_estimators': [500],
                     'alpha': [0.5],
                     'learning_rate': [0.062],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }

xgb_reg_en_third_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                            xgb_reg_en_model,
                                                                            xgb_reg_en_params)
xgb_reg_en_third_cycle_results_df
CPU times: user 4min 8s, sys: 3.14 s, total: 4min 11s
Wall time: 1min 11s
Out[95]:
rmse r2 time total selected features selected features by
0 0.424 0.858 17 s 31 ElasticNet
1 0.451 0.840 12 s 12 RandomForestRegressor
2 0.447 0.843 19 s 43 XGBoostRegressor
3 0.427 0.856 23 s 51 Union of features selected by each model
In [96]:
xgb_reg_en_model = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_en_reduced_data_12f, train_and_test_data_reduced_12f  = run_training_cycle(rfr_en_mif_12_labels,
                                                                                   xgb_reg_en_params,
                                                                                   xgb_reg_en_model,
                                                                                   v=False)

2.1.2 - Training curve

In [97]:
# Plot training curve

x_train_std_en_12f = train_and_test_data_reduced_12f['train'][0]


xgb_reg_en_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 0.5,
                     'learning_rate': 0.062,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}


plot_validation_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                      x_train_std_en_12f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 800, 100),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

2.1.3 - Learning curve

In [98]:
# Plot learning curve

plot_learning_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                    x_train_std_en_12f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [99]:
xgb_reg_en_reduced_data_12f['learning_potential'] = 'Yes'

2.2 - XGBoost (target : 'TotalGHGEmissions' )

2.2.1 - Third training cycle (with feature selection)

In [100]:
%%time 

# Third training cycle with features reduced

xgb_reg_em_model = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_em_params = {'n_estimators': [750],
                     'alpha': [1.6],
                     'learning_rate': [0.05],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }


xgb_reg_em_third_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                            xgb_reg_em_model,
                                                                            xgb_reg_em_params)
xgb_reg_em_third_cycle_results_df
CPU times: user 5min 6s, sys: 3.54 s, total: 5min 10s
Wall time: 1min 29s
Out[100]:
rmse r2 time total selected features selected features by
0 0.443 0.846 19 s 36 ElasticNet
1 0.458 0.835 18 s 18 RandomForestRegressor
2 0.441 0.847 23 s 43 XGBoostRegressor
3 0.445 0.844 30 s 52 Union of features selected by each model
In [101]:
xgb_reg_em_model = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_em_reduced_data_18f, train_and_test_data_reduced_18f  = run_training_cycle(rfr_em_mif_18_labels,
                                                                                   xgb_reg_em_params,
                                                                                   xgb_reg_em_model,
                                                                                   v=False)

2.2.2 - Training curve

In [102]:
# Plot training curve

x_train_std_em_18f = train_and_test_data_reduced_18f['train'][0]


xgb_reg_em_params = {'objective': 'reg:squarederror',
                     'n_estimators': 750,
                     'alpha': 1.6,
                     'learning_rate': 0.05,
                     'max_depth': 20,
                     'colsample_bytree': 0.35
                     }

plot_validation_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                      x_train_std_em_18f,
                      y_train_emissions,
                      'n_estimators',
                      np.arange(200, 800, 100),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

2.2.3 - Learning curve

In [103]:
# Plot learning curve

plot_learning_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                    x_train_std_em_18f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [104]:
xgb_reg_em_reduced_data_18f['learning_potential'] = 'Yes'

IV - Results

1 - Training results

In [107]:
# Best models for energy target (model data dictionaries from train_gridsearch wrapper)
best_en_models = [enet_en_reduced_data_31f,
                  rfr_en_reduced_data_12f,
                  xgb_reg_en_reduced_data_12f, #xgb_reg_en_reduced_data_44f,
                  kernel_svr_en_reduced_data_12f,
                  mlpr_en_reduced_data_12f]

# Best models for emissions target (model data dictionaries from train_gridsearch wrapper)
best_em_models = [enet_em_reduced_data_36f,
                  rfr_em_reduced_data_18f,
                  xgb_reg_em_reduced_data_18f, #xgb_reg_em_reduced_data_44f,
                  kernel_svr_em_reduced_data_18f,
                  mlpr_em_reduced_data_18f]

# Build main evaluation variables list from a random model data dictionary
model_data_keys = dict(list(rfr_en_reduced_data_12f.items())[2:])
# --> ['model_name', 'rmse', 'r2', 'time', 'n_features', 'learning_potential']
# Build main evaluation variables for result dataframes
training_results_keys = ['Model', 'RMSE', 'R2', 'Run time', 'Selected features', 'Learning potential']

# Build result dictionaries for each target
energy_training_results = {k: [] for k in training_results_keys}
emissions_training_results = {k: [] for k in training_results_keys}

# Fill result dictionaries with model data for each target
for en_model, em_model in zip(best_en_models, best_em_models):
    for k1, k2 in zip(training_results_keys, model_data_keys):
        energy_training_results[k1].append(en_model[k2])
        emissions_training_results[k1].append(em_model[k2])


target_dicts = [energy_training_results, emissions_training_results]

# Build result dataframes for each target
en_results_df, em_results_df = [pd.DataFrame(target_dict) for target_dict in target_dicts]

1.1 - Training results (target : 'SiteEnergyUse(kBtu)')

In [108]:
en_results_df
# Best model XGBRegressor
Out[108]:
Model RMSE R2 Run time Selected features Learning potential
0 ElasticNetCV 0.559 0.754 1 s 31 No
1 RandomForestRegressor 0.480 0.818 9 s 12 Yes
2 XGBRegressor 0.451 0.840 11 s 12 Yes
3 SVR 0.589 0.727 9 s 12 No
4 MLPRegressor 0.593 0.723 3 min 20 s 12 No

1.2 - Training results (target : 'TotalGHGEmissions')

In [109]:
em_results_df
# Best model XGBRegressor
Out[109]:
Model RMSE R2 Run time Selected features Learning potential
0 ElasticNetCV 0.572 0.743 1 s 36 No
1 RandomForestRegressor 0.474 0.823 16 s 18 Yes
2 XGBRegressor 0.458 0.835 25 s 18 Yes
3 SVR 0.572 0.742 11 s 18 No
4 MLPRegressor 0.580 0.735 1 min 28 s 18 Yes

1.3 - Save results data

In [110]:
# Energy target ('SiteEnergyUse(kBtu)') results
en_results_data = {'en_results_df': en_results_df,
                   'en_best_model': xgb_reg_en_reduced_data_12f,
                   'en_best_features': rfr_en_mif_12_labels}

# Emissions target ('TotalGHGEmissions') results
em_results_data = {'em_results_df': em_results_df,
                   'em_best_model': xgb_reg_em_reduced_data_18f,
                   'em_best_features': rfr_em_mif_18_labels}

# Merge target results into another dictionary
results_data = {'en': en_results_data, 'em': em_results_data}

# Save it as a .pkl file
pickle_data(filename='main_results_no_ENERGYSTARScore_no_main_energy_vars',
            folder='../data/pkl',
            data=results_data,
            method='w')

2 - Data acquisition : best feature selection

In [5]:
rfr_en_mif_12_labels == rfr_em_mif_18_labels
Out[5]:
False

2.1 - Selected features ( target : 'SiteEnergyUse(kBtu)' )

In [7]:
rfr_en_mif_12_labels
Out[7]:
['PropertyGFATotal',
 'North_south_dist',
 'East_west_dist',
 'PrimaryPropertyType_Supermarket/GroceryStore',
 'SecondLargestPropertyUseTypeGFA',
 'PrimaryPropertyType_Other',
 'BuildingType_Multifamily LR (1-4)',
 'BuildingType_Multifamily MR (5-9)',
 'PrimaryPropertyType_Self-StorageFacility',
 'PrimaryPropertyType_Warehouse',
 'PrimaryPropertyType_Non-RefrigeratedWarehouse',
 'Neighborhood_Downtown']
In [9]:
# Plot feature importance from best model 
xgb.plot_importance(xgb_reg_en_reduced_data_12f['model'])
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd8e4277890>
In [12]:
# "North_south_dist" --> (built from) Latitude
# "East_west_dist" --> (built from) Longitude

selected_features_energy_target = ['Latitude',
                                   'Longitude',
                                   'PropertyGFATotal',
                                   'BuildingType',
                                   'Neighborhood',
                                   'PrimaryPropertyType',
                                   'SecondLargestPropertyUseTypeGFA']

print(f"Total of variables required : {len(selected_features_energy_target)}")
Total of variables required : 7

2.2 - Selected features ( target : 'TotalGHGEmissions' )

In [10]:
rfr_em_mif_18_labels
Out[10]:
['PropertyGFATotal',
 'North_south_dist',
 'East_west_dist',
 'BuildingType_Multifamily LR (1-4)',
 'SecondLargestPropertyUseTypeGFA',
 'PrimaryPropertyType_Supermarket/GroceryStore',
 'PrimaryPropertyType_Other',
 'PrimaryPropertyType_Hotel',
 'BuildingType_Multifamily MR (5-9)',
 'OldBuilding',
 'PrimaryPropertyType_Self-StorageFacility',
 'Neighborhood_East',
 'PrimaryPropertyType_Warehouse',
 'PrimaryPropertyType_SeniorCareCommunity',
 'PrimaryPropertyType_Non-RefrigeratedWarehouse',
 'Neighborhood_Downtown',
 'PrimaryPropertyType_LargeOffice',
 'PrimaryPropertyType_Restaurant']
In [11]:
# Plot feature importance from best model 
xgb.plot_importance(xgb_reg_em_reduced_data_18f['model'])
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd8c77cb210>
In [13]:
# "North_south_dist" --> (built from) Latitude
# "East_west_dist" --> (built from) Longitude
# "OldBuilding" --> (built from) YearBuilt

selected_features_emissions_target = ['Latitude',
                                      'Longitude',
                                      'YearBuilt',
                                      'PropertyGFATotal',
                                      'BuildingType',
                                      'Neighborhood',
                                      'PrimaryPropertyType',
                                      'SecondLargestPropertyUseTypeGFA'] 

print(f"Total of variables required : {len(selected_features_emissions_target)}")
Total of variables required : 8